library(pollstR)
library(XML)
library(reshape)
library(ggplot2)
library(WDI)
library(lme4)

## FIGURE S20
#predicting the US election
#need for smoothing: polls, pro-incumbent score, growth, pollster, country, region
#need for outcome: polity(10), inflation rate (.8 pct)

##########
# ::::::: APSA predictions :::::
# This model used a more limited smoothing stage compared to the model presented in the paper. 
# We have improved on estimation of the variability, see predict_US_elections_updated.R
#######

######### POLLS ################## from RealClearPolitics
setwd("xxxxx")
pls <- read.csv("2016_US_presidential.txt", sep="\t", fileEncoding = "UTF-16")

#getting the correct period of the polls
pls$d1 <- as.Date(gsub(" ", "", paste(substr(pls$Date, start=1, stop=4), "/16", sep="")), format="%m/%d/%y")
pls$d2 <- as.Date(gsub(" ", "", paste(substr(pls$Date, start=7, stop=11), "/16", sep="")), format="%m/%d/%y")
pls$date <- pls$d1 + round((pls$d2-pls$d1)/2)
pls$days.to.elec <- as.Date("2016-11-07") - pls$date 
pls$wts = scale(1/(as.numeric(pls$days.to.elec+2)), center=F)

# Smoothed estimates
dat <- read.csv("~/Downloads/Science Replication and Data - Polling/global_polling_replication_data.csv")
dat$wts <- scale(1/(dat$days.to.elec+2), center=F) #+ scale(1/((abs(dat$pro.incumbent)+1) ), center=F) +scale(dat$n.elecs, center=F) #should do this by election
dat$pro.incumbent <- scale(dat$pro.incumbent, center=F)
sm.mod <- lmer(pollmarg ~ pro.incumbent + (pro.incumbent|incRun) + (1|asp.gdp.bin) + (1|electionid)+(1|Pollster), na.action=na.exclude, weights=wts, data=dat, control=lmerControl(optimizer="bobyqa", boundary.tol = 1e-2, check.scaleX="silent.rescale"))
dat$sm.pollmarg <- predict(sm.mod)

# Aggregate and predict
library(caret); set.seed(13243)
sm.agg <- aggregate(cbind(sm.pollmarg, pollmarg, realmarg, year, region=as.numeric(as.factor(region)), incRun, incApp, incExtend,  multiparty, polity2, l1polity2, wb.inflation, asp.gdp, Round=as.numeric(as.factor(Round)))~electionid, median, data=dat)

trcontrol <- trainControl(method="timeslice", initialWindow = 50, horizon=1, fixedWindow=F, savePredictions = T)# , indexOut = indexOut
sm.agg <- sm.agg[order(sm.agg$year), ]
tr.stru <- train(realmarg~sm.pollmarg+l1polity2+wb.inflation, data=sm.agg, trControl=trcontrol, metric="RMSE", method="pls", tuneGrid=expand.grid(ncomp=2)) 

## USA DATA:: about a 7.3 margin in the popular vote
n.reps = 150
preds <- numeric(n.reps)

# 150 draws of random samples of the polls to inherit their variability, then plug those numbers into the second stage
set.seed(22314)
for(i in 1:n.reps){
  mod <- lmer(Spread~(1|Poll), weights = wts, data=pls[sample(1:nrow(pls), nrow(pls)-10, replace = T), ])
  pls$pred <- predict(mod, allow.new.levels=T, newdata=pls)
  
  USdf <- data.frame(sm.pollmarg = median(pls$pred), l1polity2 = 10, wb.inflation = .84)
  
  preds[i] = predict(tr.stru$finalModel, newdat=USdf)[,,2]
}

# Probability based on the resamples of the learning model:
pnorm(7.3, mean=0, sd=7.2)# about 84% Hillary
# Which is more aggressive than 538, which puts the probability at 74%. 
set.seed(7523)
probs <- rnorm(3000, mean=7.3, sd=7.2)
qplot(probs, fill=I("blue")) + geom_vline(xintercept = 0) + xlab("Hillary Simulated Margin") + ylab("Frequency")
